In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from warnings import filterwarnings 
filterwarnings("ignore")
import plotly.express as px
%matplotlib inline
sns.set_style('whitegrid')
plt.style.use('fivethirtyeight')              
D:\anaconda files\lib\site-packages\scipy\__init__.py:155: UserWarning: A NumPy version >=1.18.5 and <1.25.0 is required for this version of SciPy (detected version 1.26.4
  warnings.warn(f"A NumPy version >={np_minversion} and <{np_maxversion}"
In [2]:
data=pd.read_csv(r"C:\\Users\\laxma\\Downloads\\KAG_conversion_data.csv")
data
Out[2]:
ad_id xyz_campaign_id fb_campaign_id age gender interest Impressions Clicks Spent Total_Conversion Approved_Conversion
0 708746 916 103916 30-34 M 15 7350 1 1.430000 2 1
1 708749 916 103917 30-34 M 16 17861 2 1.820000 2 0
2 708771 916 103920 30-34 M 20 693 0 0.000000 1 0
3 708815 916 103928 30-34 M 28 4259 1 1.250000 1 0
4 708818 916 103928 30-34 M 28 4133 1 1.290000 1 1
... ... ... ... ... ... ... ... ... ... ... ...
1138 1314410 1178 179977 45-49 F 109 1129773 252 358.189997 13 2
1139 1314411 1178 179978 45-49 F 110 637549 120 173.880003 3 0
1140 1314412 1178 179979 45-49 F 111 151531 28 40.289999 2 0
1141 1314414 1178 179981 45-49 F 113 790253 135 198.710001 8 2
1142 1314415 1178 179982 45-49 F 114 513161 114 165.609999 5 2

1143 rows × 11 columns

In [3]:
data.head()
Out[3]:
ad_id xyz_campaign_id fb_campaign_id age gender interest Impressions Clicks Spent Total_Conversion Approved_Conversion
0 708746 916 103916 30-34 M 15 7350 1 1.43 2 1
1 708749 916 103917 30-34 M 16 17861 2 1.82 2 0
2 708771 916 103920 30-34 M 20 693 0 0.00 1 0
3 708815 916 103928 30-34 M 28 4259 1 1.25 1 0
4 708818 916 103928 30-34 M 28 4133 1 1.29 1 1
In [4]:
data.tail()
Out[4]:
ad_id xyz_campaign_id fb_campaign_id age gender interest Impressions Clicks Spent Total_Conversion Approved_Conversion
1138 1314410 1178 179977 45-49 F 109 1129773 252 358.189997 13 2
1139 1314411 1178 179978 45-49 F 110 637549 120 173.880003 3 0
1140 1314412 1178 179979 45-49 F 111 151531 28 40.289999 2 0
1141 1314414 1178 179981 45-49 F 113 790253 135 198.710001 8 2
1142 1314415 1178 179982 45-49 F 114 513161 114 165.609999 5 2
In [5]:
data.describe()
Out[5]:
ad_id xyz_campaign_id fb_campaign_id interest Impressions Clicks Spent Total_Conversion Approved_Conversion
count 1.143000e+03 1143.000000 1143.000000 1143.000000 1.143000e+03 1143.000000 1143.000000 1143.000000 1143.000000
mean 9.872611e+05 1067.382327 133783.989501 32.766404 1.867321e+05 33.390201 51.360656 2.855643 0.944007
std 1.939928e+05 121.629393 20500.308622 26.952131 3.127622e+05 56.892438 86.908418 4.483593 1.737708
min 7.087460e+05 916.000000 103916.000000 2.000000 8.700000e+01 0.000000 0.000000 0.000000 0.000000
25% 7.776325e+05 936.000000 115716.000000 16.000000 6.503500e+03 1.000000 1.480000 1.000000 0.000000
50% 1.121185e+06 1178.000000 144549.000000 25.000000 5.150900e+04 8.000000 12.370000 1.000000 1.000000
75% 1.121804e+06 1178.000000 144657.500000 31.000000 2.217690e+05 37.500000 60.025000 3.000000 1.000000
max 1.314415e+06 1178.000000 179982.000000 114.000000 3.052003e+06 421.000000 639.949998 60.000000 21.000000
In [6]:
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1143 entries, 0 to 1142
Data columns (total 11 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ad_id                1143 non-null   int64  
 1   xyz_campaign_id      1143 non-null   int64  
 2   fb_campaign_id       1143 non-null   int64  
 3   age                  1143 non-null   object 
 4   gender               1143 non-null   object 
 5   interest             1143 non-null   int64  
 6   Impressions          1143 non-null   int64  
 7   Clicks               1143 non-null   int64  
 8   Spent                1143 non-null   float64
 9   Total_Conversion     1143 non-null   int64  
 10  Approved_Conversion  1143 non-null   int64  
dtypes: float64(1), int64(8), object(2)
memory usage: 98.4+ KB
In [7]:
data.isnull().sum()
Out[7]:
ad_id                  0
xyz_campaign_id        0
fb_campaign_id         0
age                    0
gender                 0
interest               0
Impressions            0
Clicks                 0
Spent                  0
Total_Conversion       0
Approved_Conversion    0
dtype: int64
In [8]:
data=data.dropna()
data
Out[8]:
ad_id xyz_campaign_id fb_campaign_id age gender interest Impressions Clicks Spent Total_Conversion Approved_Conversion
0 708746 916 103916 30-34 M 15 7350 1 1.430000 2 1
1 708749 916 103917 30-34 M 16 17861 2 1.820000 2 0
2 708771 916 103920 30-34 M 20 693 0 0.000000 1 0
3 708815 916 103928 30-34 M 28 4259 1 1.250000 1 0
4 708818 916 103928 30-34 M 28 4133 1 1.290000 1 1
... ... ... ... ... ... ... ... ... ... ... ...
1138 1314410 1178 179977 45-49 F 109 1129773 252 358.189997 13 2
1139 1314411 1178 179978 45-49 F 110 637549 120 173.880003 3 0
1140 1314412 1178 179979 45-49 F 111 151531 28 40.289999 2 0
1141 1314414 1178 179981 45-49 F 113 790253 135 198.710001 8 2
1142 1314415 1178 179982 45-49 F 114 513161 114 165.609999 5 2

1143 rows × 11 columns

In [9]:
data.isnull().sum()
Out[9]:
ad_id                  0
xyz_campaign_id        0
fb_campaign_id         0
age                    0
gender                 0
interest               0
Impressions            0
Clicks                 0
Spent                  0
Total_Conversion       0
Approved_Conversion    0
dtype: int64
In [10]:
data.duplicated().sum()
Out[10]:
0
In [11]:
#VISUALIZATION
In [12]:
plt.bar(data['gender'],data['Total_Conversion'])
plt.xticks(rotation=90)
plt.show()
In [13]:
fig=px.bar(data,x='interest',y='Impressions',color='interest')
fig.show()
In [14]:
fig=px.violin(data,x='Approved_Conversion',y='fb_campaign_id',color='Approved_Conversion')
fig.show()
In [15]:
fig=px.bar(data,x='Total_Conversion',y='Clicks',color='Clicks')
fig.show()
In [16]:
plt.scatter(data['Spent'],data['Total_Conversion'],color='red')
plt.xticks(rotation=90)
plt.show()
In [17]:
plt.figure(figsize=(10,4))
sns.countplot(x='interest', data=data, color='cyan')
plt.xticks(rotation=90)
plt.show()
In [18]:
sns.barplot(data['Approved_Conversion'],data['fb_campaign_id'],color='r')
plt.xticks(rotation=90)
plt.show()
In [19]:
sns.lineplot(x='Approved_Conversion', y='ad_id', data=data).set_title('Approved_Conversion by ad_id')
Out[19]:
Text(0.5, 1.0, 'Approved_Conversion by ad_id')
In [20]:
plt.figure(figsize=(8, 6))
data.age.hist(bins=data.age.nunique())
plt.xlabel('age')
Out[20]:
Text(0.5, 0, 'age')
In [21]:
sns.relplot(x='Clicks',y='ad_id',data=data)
Out[21]:
<seaborn.axisgrid.FacetGrid at 0x1e0e68876a0>
In [22]:
sns.displot(data["age"])
Out[22]:
<seaborn.axisgrid.FacetGrid at 0x1e0e68b3760>
In [ ]:
 
In [23]:
plt.figure(figsize=(8, 6))
data.age.hist(bins=data.age.nunique())
plt.xlabel('age')
Out[23]:
Text(0.5, 0, 'age')
In [24]:
plt.figure(figsize=(8,6))
sns.jointplot(x=data["Spent"],y=data['age'])
Out[24]:
<seaborn.axisgrid.JointGrid at 0x1e0e64a6a30>
<Figure size 800x600 with 0 Axes>
In [25]:
plt.figure(figsize=(8,6))
sns.jointplot(x=data["interest"],y=data.Total_Conversion, kind='kde')
Out[25]:
<seaborn.axisgrid.JointGrid at 0x1e0e64c4040>
<Figure size 800x600 with 0 Axes>
In [26]:
plt.figure(figsize=(8,6))
sns.jointplot(x=data["fb_campaign_id"],y=data["Approved_Conversion"])
Out[26]:
<seaborn.axisgrid.JointGrid at 0x1e0e6570cd0>
<Figure size 800x600 with 0 Axes>
In [27]:
sns.pairplot(data)
Out[27]:
<seaborn.axisgrid.PairGrid at 0x1e0e3b44f10>
In [28]:
#MODEL BUILDING
In [29]:
data['Clicks'].value_counts()
Out[29]:
0      207
1      119
2       70
3       51
4       34
      ... 
421      1
119      1
111      1
45       1
252      1
Name: Clicks, Length: 183, dtype: int64
In [30]:
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
 
def print_score(clf, X_train, y_train, X_test, y_test, train=True):
    if train:
        pred=clf.predict(X_train)
        clf_report=pd.DataFrame(classification_report(y_train, pred, output_dict=True))
        print("train result:\n")
        print(f"accuracy score:{accuracy_score(y_train,pred)*100:.2f}%")
        print(f"CLASSIFICATION REPORT:\n,{clf_report}")
        print(f"\nconfusion matrix:\n{confusion_matrix(y_train,pred)}\n")
            
    elif train==False:
        pred=clf.predict(X_test)
        clf_report=pd.DataFrame(classification_report(y_test, pred, output_dict=True))
        print("train result:\n")
        print(f"accuracy score:{accuracy_score(y_test,pred)*100:.2f}%")
        print(f"CLASSIFICATION REPORT:\n,{clf_report}")
        print(f"\nconfusion matrix:\n{confusion_matrix(y_test,pred)}\n")
In [31]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

X = data.drop(['ad_id', 'xyz_campaign_id', 'fb_campaign_id', 'age', 'gender'], axis=1)
y = data['Clicks']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
 
    
num_columns = ['interest', 'Clicks', 'Approved_Conversion', 'Total_Conversion', 'Impressions']

ct = make_column_transformer(
     (MinMaxScaler(), num_columns),
     (StandardScaler(), num_columns),
    remainder = 'passthrough'
)

X_train = ct.fit_transform(X_train)
X_test = ct.transform(X_test)
In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

lr_clf=LogisticRegression(solver='liblinear')
lr_clf.fit(X_train, y_train)

print_score(lr_clf, X_train, y_train, X_test, y_test, train=True)
print_score(lr_clf, X_train, y_train, X_test, y_test, train=False)
train result:

accuracy score:30.88%
CLASSIFICATION REPORT:
,                    0          1     2     3     4          5     6     7  \
precision    0.965753   0.420455   0.0   0.0   0.0   0.097561   0.0   0.0   
recall       1.000000   0.936709   0.0   0.0   0.0   0.148148   0.0   0.0   
f1-score     0.982578   0.580392   0.0   0.0   0.0   0.117647   0.0   0.0   
support    141.000000  79.000000  49.0  38.0  25.0  27.000000  12.0  21.0   

              8    9  ...  272  276  282  295  346       353       421  \
precision   0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.333333  0.333333   
recall      0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  1.000000  1.000000   
f1-score    0.0  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.500000  0.500000   
support    10.0  7.0  ...  1.0  1.0  1.0  1.0  1.0  1.000000  1.000000   

           accuracy   macro avg  weighted avg  
precision   0.30875    0.056544      0.238131  
recall      0.30875    0.088461      0.308750  
f1-score    0.30875    0.056183      0.254989  
support     0.30875  800.000000    800.000000  

[4 rows x 161 columns]

confusion matrix:
[[141   0   0 ...   0   0   0]
 [  5  74   0 ...   0   0   0]
 [  0  48   0 ...   0   0   0]
 ...
 [  0   0   0 ...   0   0   0]
 [  0   0   0 ...   0   1   0]
 [  0   0   0 ...   0   0   1]]

train result:

accuracy score:30.03%
CLASSIFICATION REPORT:
,                   0          1     2     3    4         5    6    7    8  \
precision   0.929577   0.492958   0.0   0.0  0.0  0.055556  0.0  0.0  0.0   
recall      1.000000   0.875000   0.0   0.0  0.0  0.142857  0.0  0.0  0.0   
f1-score    0.963504   0.630631   0.0   0.0  0.0  0.080000  0.0  0.0  0.0   
support    66.000000  40.000000  21.0  13.0  9.0  7.000000  6.0  8.0  4.0   

                   9  ...  233  235  245  247  340  353  367  accuracy  \
precision   1.000000  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.300292   
recall      0.100000  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.300292   
f1-score    0.181818  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.300292   
support    10.000000  ...  1.0  1.0  1.0  1.0  1.0  0.0  1.0  0.300292   

            macro avg  weighted avg  
precision    0.022945      0.266645  
recall       0.019610      0.300292  
f1-score     0.017185      0.265874  
support    343.000000    343.000000  

[4 rows x 111 columns]

confusion matrix:
[[66  0  0 ...  0  0  0]
 [ 5 35  0 ...  0  0  0]
 [ 0 18  0 ...  0  0  0]
 ...
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]]

In [ ]: